04 - Multiclass logistic regression from scratch


In [1]:
from __future__ import print_function
import numpy as np
import mxnet as mx
from mxnet import gluon

In [2]:
# Set the random seed for reproducibility
mx.random.seed(1)

In [3]:
# Set the contexts for the data and the model (CPU in both cases)
data_ctx = mx.cpu()
model_ctx = mx.cpu()

Data preprocessing


In [4]:
# Scale pixel values from the 0-255 range down to 0-1
def transform(data, label):
    return data.astype(np.float32) / 255, label.astype(np.float32)
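
As a quick sanity check (illustrative, not part of the original notebook), the transform can be applied to a synthetic image to confirm that pixel values end up in the 0-1 range:

fake_image = np.random.randint(0, 256, size=(28, 28, 1)).astype(np.uint8)  # made-up image
scaled, lbl = transform(fake_image, np.int32(7))
print(scaled.min(), scaled.max())  # both values should lie in [0.0, 1.0]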

Loading MNIST dataset


In [5]:
mnist_train = gluon.data.vision.MNIST(train=True, 
                                      transform=transform)
mnist_test = gluon.data.vision.MNIST(train=False, 
                                     transform=transform)

In [6]:
image, label = mnist_train[0]

In [7]:
image.shape


Out[7]:
(28, 28, 1)

In [8]:
label


Out[8]:
5.0

Showing a data example


In [9]:
# Repeat the single grayscale channel three times so imshow can render it as RGB
im = mx.nd.tile(image, (1, 1, 3))
print(im.shape)


(28, 28, 3)

In [10]:
import matplotlib.pyplot as plt
plt.imshow(im.asnumpy())
plt.show()


Data Information / Model Parameters


In [11]:
num_inputs = 784      # 28 * 28 pixels, flattened into a single vector
num_outputs = 10      # one output per digit class
num_examples = 60000  # number of training examples

Data Iterators


In [12]:
batch_size = 64
train_data = gluon.data.DataLoader(dataset=mnist_train, 
                                   batch_size=batch_size, 
                                   shuffle=True)
test_data = gluon.data.DataLoader(dataset=mnist_test, 
                                  batch_size=batch_size, 
                                  shuffle=False)
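
Drawing one batch (an illustrative check, not in the original notebook) confirms the shapes the loaders produce:

for data, label in train_data:
    print(data.shape, label.shape)  # (64, 28, 28, 1) (64,)
    break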

Model Parameters


In [13]:
# Randomly initialize the weights and biases
W = mx.nd.random_normal(shape=(num_inputs, num_outputs),
                        ctx=model_ctx)
b = mx.nd.random_normal(shape=num_outputs,
                        ctx=model_ctx)

params = [W, b]

Gradients


In [14]:
# Allocate a gradient buffer for each parameter so autograd can write into it
for param in params:
    param.attach_grad()

Softmax


In [15]:
def softmax(y_linear):
    # Subtract the row-wise max before exponentiating for numerical stability
    exp = mx.nd.exp(y_linear - mx.nd.max(y_linear, axis=1).reshape((-1, 1)))
    # Normalize each row so that it sums to 1
    norms = mx.nd.sum(exp, axis=1).reshape((-1, 1))
    return exp / norms

Example


In [16]:
sample_y_linear = mx.nd.random_normal(shape=(1, 3))
sample_yhat = softmax(sample_y_linear)
print(sample_y_linear)
print(sample_yhat)


[[1.8619109 0.9664041 0.4839427]]
<NDArray 1x3 @cpu(0)>

[[0.6022316  0.24595171 0.1518167 ]]
<NDArray 1x3 @cpu(0)>

In [17]:
print(mx.nd.sum(sample_yhat, axis=1))


[1.]
<NDArray 1 @cpu(0)>

Define the model


In [18]:
def net(X):
    y_linear = mx.nd.dot(X, W) + b
    yhat = softmax(y_linear)
    return yhat
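
A quick shape check (illustrative, assuming the data loaders defined above) shows that the model maps a flattened batch to one row of class probabilities per example:

for data, _ in test_data:
    out = net(data.as_in_context(model_ctx).reshape((-1, 784)))
    print(out.shape)                   # (64, 10)
    print(mx.nd.sum(out, axis=1)[:3])  # each row sums to 1
    break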

Cross-entropy loss function


In [19]:
def cross_entropy(yhat, y):
    # Add a small constant inside the log to avoid taking log(0)
    return - mx.nd.sum(y * mx.nd.log(yhat + 1e-6))
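
The loss is small when the predicted distribution concentrates on the true class and large when it does not. A minimal illustration with hypothetical values (not from the original notebook):

y_true = mx.nd.array([[0, 1, 0]])              # one-hot label for class 1
confident = mx.nd.array([[0.05, 0.90, 0.05]])  # mass on the correct class
wrong = mx.nd.array([[0.90, 0.05, 0.05]])      # mass on a wrong class
print(cross_entropy(confident, y_true))        # roughly -log(0.9), about 0.11
print(cross_entropy(wrong, y_true))            # roughly -log(0.05), about 3.0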

Stochastic Gradient Descent (SGD)


In [20]:
def SGD(params, lr):
    for param in params:
        # In-place update: param := param - lr * grad
        param[:] = param - lr * param.grad
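
The update rule is the usual param := param - lr * grad, applied in place. A tiny standalone example with made-up values (illustrative only):

w = mx.nd.array([1.0, 2.0], ctx=model_ctx)
w.attach_grad()
with mx.autograd.record():
    loss = mx.nd.sum(w * w)
loss.backward()     # w.grad is now [2., 4.]
SGD([w], lr=0.1)
print(w)            # [0.8, 1.6]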

Accuracy calculations


In [21]:
def evaluate_accuracy(data_iterator, net):
    # Numerator stores the number of correct predictions
    numerator = 0.
    # Denominator stores the total number of samples
    denominator = 0.
    for i, (data, label) in enumerate(data_iterator):
        data = data.as_in_context(model_ctx).reshape((-1, 784))
        label = label.as_in_context(model_ctx)
        output = net(data)
        predictions = mx.nd.argmax(output, axis=1)
        numerator += mx.nd.sum(predictions == label)
        denominator += data.shape[0]
    return (numerator / denominator).asscalar()

Accuracy of the randomly initialized network (with 10 classes we expect roughly chance-level accuracy, i.e. about 0.1)


In [22]:
evaluate_accuracy(test_data, net)


Out[22]:
0.1167

Training


In [23]:
epochs = 5
learning_rate = .005

# Note: range(epochs + 1) makes epochs + 1 = 6 passes over the training data
for e in range(epochs + 1):
    cumulative_loss = 0
    for i, (data, label) in enumerate(train_data):
        data = data.as_in_context(model_ctx).reshape((-1, 784))
        label = label.as_in_context(model_ctx)
        label_one_hot = mx.nd.one_hot(label, 10)
        with mx.autograd.record():
            output = net(data)
            loss = cross_entropy(output, label_one_hot)
        loss.backward()
        SGD(params, learning_rate)
        cumulative_loss += mx.nd.sum(loss).asscalar()

    test_accuracy = evaluate_accuracy(test_data, net)
    train_accuracy = evaluate_accuracy(train_data, net)
    print("Epoch %s. Loss: %s, Train_acc %s, Test_acc %s" % (e, cumulative_loss / num_examples, train_accuracy, test_accuracy))


Epoch 0. Loss: 1.3976815716902415, Train_acc 0.8563833, Test_acc 0.8612
Epoch 1. Loss: 0.6160752163807551, Train_acc 0.88145, Test_acc 0.8845
Epoch 2. Loss: 0.5194016260226567, Train_acc 0.892, Test_acc 0.8901
Epoch 3. Loss: 0.46759953509966534, Train_acc 0.8972, Test_acc 0.8944
Epoch 4. Loss: 0.4347682308415572, Train_acc 0.9018833, Test_acc 0.8962
Epoch 5. Loss: 0.41056529971758526, Train_acc 0.90611666, Test_acc 0.9016

Prediction


In [24]:
# Define a helper that returns the predicted class for each example
def model_predict(net, data):
    output = net(data)
    return mx.nd.argmax(output, axis=1)

In [25]:
# Sample 10 random data points from the test set
sample_data = gluon.data.DataLoader(mnist_test, 10, shuffle=True)

In [26]:
for i, (data, label) in enumerate(sample_data):
    data = data.as_in_context(model_ctx)
    print(data.shape)
    # Stitch the 10 images side by side into a single 28 x 280 strip for display
    im = mx.nd.transpose(data, (1, 0, 2, 3))
    im = mx.nd.reshape(im, (28, 10 * 28, 1))
    imtiles = mx.nd.tile(im, (1, 1, 3))

    plt.imshow(imtiles.asnumpy())
    plt.show()
    pred = model_predict(net, data.reshape((-1, 784)))
    print('model predictions are:', pred)
    break


(10, 28, 28, 1)
model predictions are: 
[4. 6. 9. 2. 1. 6. 6. 9. 6. 7.]
<NDArray 10 @cpu(0)>